Climate Aid → Pollution — ADM2 Staggered DiD Exploration

EDGAR × GODAD — ADM2-level treatment (first project) & pollution outcomes

Author

Pierre Beaucoral

Published

October 9, 2025

Show code
suppressPackageStartupMessages({
  library(data.table); library(dplyr); library(tidyr)
  library(readr); library(arrow); library(sf)
  library(stringr); library(janitor)
  library(ggplot2); library(scales)
  # DiD
  library(did)            # Callaway & Sant'Anna
  library(fixest)         # sunab() if you also want SA later
})

options(datatable.print.nrows = 50)
setDTthreads(percent = 100)
Show code
params <- list(
  adm2_shp  = '/Users/pierrebeaucoral/Documents/Pro/Thèse CERDI/Recherche/GODAD/Data/GADM/gadm_410-levels_ADM_2.shp',
  edgar_cache_file = "/Users/pierrebeaucoral/Documents/Pro/Thèse CERDI/Recherche/GODAD/Data/Cache/edgar_adm2_panel.rds",
  edgar_years = 1997:2023,
  edgar_var = "CO2",
  godad_file = '/Users/pierrebeaucoral/Documents/Pro/Thèse CERDI/Recherche/GODAD/Data/climate_finance_total.csv',
  godad_gid2_col = 'gid_2',
  godad_year = 'closingyear',
  crs_target = 4326
)

1 Aim & design

We create an ADM2-year panel to study the effect of climate aid arrival on pollution outcomes from EDGAR. Treatment is defined as first year a climate project occurs in an ADM2 (staggered adoption). Outcome is the ADM2-area-weighted pollution mean.

2 Paths, helpers, and ADM2 layer

Show code
adm2 <- st_read(params$adm2_shp, quiet = TRUE) |> st_transform(params$crs_target)
nm <- names(adm2)
gid2_col  <- nm[str_detect(nm, "(?i)^GID_?2$|gid_?2|code_?2|adm2_id")][1]
name_col  <- nm[str_detect(nm, "(?i)^NAME_?2$|name_?2|adm2|district|county|province")][1]
if (is.na(gid2_col)) stop("Could not detect ADM2 code column in shapefile.")
if (is.na(name_col)) name_col <- gid2_col

adm2_key <- adm2 |>
  st_drop_geometry() |>
  transmute(adm2_id = .data[[gid2_col]],
            adm2_name = as.character(.data[[name_col]])) |>
  as.data.table()

3 EDGAR outcomes → ADM2-year

The code below recursively loads EDGAR data

Show code
stopifnot(file.exists(params$edgar_cache_file))
edgar_dt <- readRDS(params$edgar_cache_file) |> as.data.table()

# Expect columns: year, GID, co2_tonnes (from your glimpse)
edgar_dt[, year := as.integer(year)]
edgar_dt <- edgar_dt[year %in% params$edgar_years]

edgar_dt <- edgar_dt[, .(
  adm2_id = as.character(GID),
  year    = year,
  pollution_CO2 = as.numeric(co2_tonnes)
)]

# average duplicates if any
edgar_dt <- edgar_dt[, .(pollution_CO2 = mean(pollution_CO2, na.rm = TRUE)),
                     by = .(adm2_id, year)]

# Winsorized and log outcome
wfun <- function(x, p = 0.01) {
  ql <- quantile(x, p, na.rm = TRUE); qh <- quantile(x, 1-p, na.rm = TRUE)
  pmin(pmax(x, ql), qh)
}
edgar_dt[, pollution_CO2_w   := wfun(pollution_CO2)]
edgar_dt[, pollution_CO2_log := log1p(pollution_CO2)]

# base panel skeleton from EDGAR (ids present in outcomes)
ids   <- sort(unique(edgar_dt$adm2_id))
years <- sort(unique(edgar_dt$year))
base_panel <- CJ(adm2_id = ids, year = years)[edgar_dt, on = .(adm2_id, year)]
base_panel <- merge(base_panel, adm2_key, by = "adm2_id", all.x = TRUE)

3.1 Quick sanity

Show code
read_any <- function(path) {
  ext <- tolower(tools::file_ext(path))
  switch(ext,
         csv = readr::read_csv(path, show_col_types = FALSE),
         rds = readRDS(path),
         parquet = arrow::read_parquet(path),
         fst = fst::read_fst(path) |> as.data.frame(),
         stop("Unsupported GODAD format: ", ext))
}

godad <- read_any(params$godad_file) |> janitor::clean_names() |> as.data.table()
stopifnot(all(c(params$godad_gid2_col, params$godad_year) %in% names(godad)))

godad[, adm2_id := as.character(get(params$godad_gid2_col))]
godad[, year    := suppressWarnings(as.integer(get(params$godad_year)))]
godad <- godad[!is.na(adm2_id) & !is.na(year)]

# Flags from your schema
stopifnot(all(c("climate_relevance","meta_category") %in% names(godad)))
godad[, climate_relevance := as.integer(climate_relevance)]
godad[, meta_category := tolower(trimws(meta_category))]

godad[, is_climate     := climate_relevance == 1L]
godad[, is_nonclimate  := climate_relevance == 0L]
godad[, is_mitigation  := str_detect(meta_category, "\\bmitig")]     # robust
godad[, is_adaptation  := str_detect(meta_category, "\\badapt")]     # robust

# helper: first adoption year by ADM2
first_adopt <- function(dt_subset) {
  if (nrow(dt_subset) == 0L) return(data.table(adm2_id = character(), g = integer()))
  out <- dt_subset[, .(g = suppressWarnings(min(as.integer(year), na.rm = TRUE))), by = adm2_id]
  out[is.infinite(g), g := NA_integer_][]
}

adopt_adapt      <- first_adopt(godad[is_adaptation == TRUE])
adopt_mitig      <- first_adopt(godad[is_mitigation == TRUE])
adopt_climate    <- first_adopt(godad[is_climate == TRUE])
adopt_nonclimate <- first_adopt(godad[is_nonclimate == TRUE])

sapply(list(adapt = adopt_adapt, mitig = adopt_mitig, climate = adopt_climate, nonclimate = adopt_nonclimate), nrow)
     adapt      mitig    climate nonclimate 
      2132       2097       5781      16133 
Show code
yvar <- if ("pollution_CO2_log" %in% names(base_panel)) "pollution_CO2_log" else "pollution_CO2_w"

run_csdid <- function(base_panel, adopt_tbl, label, min_e = -5, max_e = 20) {
  P <- merge(base_panel[, .(adm2_id, year, y = get(yvar))],
             adopt_tbl, by = "adm2_id", all.x = TRUE)

  # Treated flags & event time
  P[, treated  := as.integer(!is.na(g) & year >= g)]
  P[, rel_time := ifelse(is.na(g), NA_integer_, year - g)]
  P[, adm2_id_int := as.integer(factor(adm2_id))]

  # Drop missing outcome and units treated in first overall year
  min_year <- min(P$year, na.rm = TRUE)
  D <- P[!is.na(y) & (is.na(g) | g > min_year)]

  if (nrow(D[!is.na(g)]) == 0L) {
    warning(sprintf("No treated units for %s — skipping.", label))
    return(NULL)
  }

  att <- did::att_gt(
    yname   = "y",
    tname   = "year",
    idname  = "adm2_id_int",
    gname   = "g",
    data    = D,
    panel   = TRUE,                 # repeated cross-sections at ADM2-year
    control_group = "notyettreated",
    allow_unbalanced_panel = TRUE
  )

  list(
    label = label,
    att   = att,
    es    = did::aggte(att, type = "dynamic", min_e = min_e, max_e = max_e),
    grp   = did::aggte(att, type = "group")
  )
}

4 CS DiD

We identify the first year in which an ADM2 receives at least one climate project. If the GODAD file already has an ADM2 code, we use it; if not, we perform a spatial join using the point geometry.

Show code
res_adapt <- run_csdid(base_panel, adopt_adapt,      "Adaptation")
res_mitig <- run_csdid(base_panel, adopt_mitig,      "Mitigation")
res_clim  <- run_csdid(base_panel, adopt_climate,    "All climate")
res_noncl <- run_csdid(base_panel, adopt_nonclimate, "Non-climate")

5 CS DiD Plot

Show code
make_es_df <- function(res) {
  if (is.null(res)) return(NULL)
  es <- res$es
  data.frame(label = res$label, egt = es$egt, att = es$att.egt, se = es$se.egt)
}

es_all <- rbind(
  make_es_df(res_adapt),
  make_es_df(res_mitig),
  make_es_df(res_clim),
  make_es_df(res_noncl)
)

if (!is.null(es_all)) {
  ggplot(es_all, aes(egt, att)) +
    geom_hline(yintercept = 0, linetype = 2) +
    geom_point() +
    geom_errorbar(aes(ymin = att - 1.96*se, ymax = att + 1.96*se), width = 0.15) +
    geom_vline(xintercept = -1, linetype = 2) +
    facet_wrap(~ label, ncol = 2, scales = "free_y") +
    labs(title = sprintf("Dynamic ATT (y = %s)", yvar),
         x = "Event time (t - g)", y = "ATT") +
    theme_minimal(base_size = 12)
}

Show code
summ_line <- function(res) {
  if (is.null(res)) return(NULL)
  s <- capture.output(print(summary(res$grp)))
  data.frame(model = res$label, summary = paste(s, collapse = "\n"))
}

summ_all <- dplyr::bind_rows(
  summ_line(res_adapt),
  summ_line(res_mitig),
  summ_line(res_clim),
  summ_line(res_noncl)
)

if (nrow(summ_all)) {
  cat("\n\n===== Overall ATT (group aggregated) =====\n")
  for (i in seq_len(nrow(summ_all))) {
    cat("\n--", summ_all$model[i], "--\n", summ_all$summary[i], "\n")
  }
}


===== Overall ATT (group aggregated) =====

-- Adaptation --
 
Call:
did::aggte(MP = att, type = "group")

Reference: Callaway, Brantly and Pedro H.C. Sant'Anna.  "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015> 


Overall summary of ATT's based on group/cohort aggregation:  
     ATT    Std. Error     [ 95%  Conf. Int.]  
 -0.0304        0.0073    -0.0448     -0.0161 *


Group Effects:
 Group Estimate Std. Error [95% Simult.  Conf. Band]  
  2001  -0.3412     0.0829       -0.5780     -0.1044 *
  2002   0.1332     0.1383       -0.2616      0.5280  
  2003  -0.0154     0.0780       -0.2380      0.2073  
  2004   0.0655     0.0743       -0.1465      0.2775  
  2005  -0.1439     0.0362       -0.2472     -0.0407 *
  2006  -0.0614     0.0355       -0.1627      0.0400  
  2007  -0.1341     0.0218       -0.1964     -0.0719 *
  2008   0.0637     0.0559       -0.0958      0.2231  
  2009  -0.0647     0.0389       -0.1757      0.0463  
  2010  -0.0596     0.0595       -0.2296      0.1104  
  2011  -0.1459     0.0455       -0.2759     -0.0159 *
  2012  -0.0686     0.0267       -0.1449      0.0077  
  2013  -0.0129     0.0212       -0.0733      0.0475  
  2014  -0.0184     0.0267       -0.0946      0.0577  
  2015  -0.0100     0.0277       -0.0890      0.0690  
  2016   0.0033     0.0210       -0.0568      0.0633  
  2017   0.0093     0.0211       -0.0509      0.0695  
  2018  -0.0118     0.0214       -0.0730      0.0493  
  2019  -0.0092     0.0162       -0.0554      0.0371  
  2020   0.0122     0.0108       -0.0187      0.0431  
  2021  -0.0327     0.0189       -0.0866      0.0211  
  2022   0.0201     0.0100       -0.0083      0.0485  
---
Signif. codes: `*' confidence band does not cover 0

Control Group:  Not Yet Treated,  Anticipation Periods:  0
Estimation Method:  Doubly Robust
NULL 

-- Mitigation --
 
Call:
did::aggte(MP = att, type = "group")

Reference: Callaway, Brantly and Pedro H.C. Sant'Anna.  "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015> 


Overall summary of ATT's based on group/cohort aggregation:  
     ATT    Std. Error     [ 95%  Conf. Int.] 
 -0.0155          0.01    -0.0352      0.0042 


Group Effects:
 Group Estimate Std. Error [95% Simult.  Conf. Band] 
  2001  -0.0536     0.0805       -0.2832      0.1761 
  2002   0.1492     0.3185       -0.7590      1.0575 
  2003   0.0288     0.0812       -0.2028      0.2604 
  2004   0.0377     0.1047       -0.2608      0.3363 
  2005  -0.0696     0.0657       -0.2569      0.1178 
  2006   0.0122     0.0959       -0.2614      0.2858 
  2007   0.1356     0.0800       -0.0925      0.3637 
  2008   0.0427     0.0517       -0.1047      0.1901 
  2009   0.0457     0.0413       -0.0722      0.1636 
  2010  -0.0180     0.0528       -0.1686      0.1327 
  2011  -0.0315     0.0369       -0.1367      0.0736 
  2012  -0.0129     0.0251       -0.0843      0.0586 
  2013  -0.0605     0.0312       -0.1494      0.0284 
  2014  -0.0593     0.0262       -0.1341      0.0154 
  2015  -0.0256     0.0308       -0.1135      0.0624 
  2016  -0.0415     0.0210       -0.1015      0.0185 
  2017  -0.0006     0.0211       -0.0607      0.0595 
  2018  -0.0347     0.0149       -0.0770      0.0077 
  2019  -0.0230     0.0195       -0.0785      0.0326 
  2020  -0.0027     0.0198       -0.0592      0.0538 
  2021  -0.0109     0.0147       -0.0530      0.0311 
  2022  -0.0091     0.0101       -0.0378      0.0197 
---
Signif. codes: `*' confidence band does not cover 0

Control Group:  Not Yet Treated,  Anticipation Periods:  0
Estimation Method:  Doubly Robust
NULL 

-- All climate --
 
Call:
did::aggte(MP = att, type = "group")

Reference: Callaway, Brantly and Pedro H.C. Sant'Anna.  "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015> 


Overall summary of ATT's based on group/cohort aggregation:  
     ATT    Std. Error     [ 95%  Conf. Int.] 
 -0.0089        0.0062    -0.0211      0.0033 


Group Effects:
 Group Estimate Std. Error [95% Simult.  Conf. Band]  
  2001  -0.0905     0.0517       -0.2407      0.0596  
  2002   0.1256     0.0502       -0.0201      0.2714  
  2003  -0.0128     0.0445       -0.1418      0.1163  
  2004   0.0380     0.0585       -0.1318      0.2078  
  2005  -0.0527     0.0343       -0.1521      0.0468  
  2006   0.0165     0.0321       -0.0767      0.1096  
  2007  -0.0016     0.0186       -0.0555      0.0523  
  2008   0.0705     0.0302       -0.0172      0.1582  
  2009   0.0079     0.0203       -0.0511      0.0669  
  2010  -0.0023     0.0263       -0.0787      0.0741  
  2011  -0.0420     0.0221       -0.1063      0.0222  
  2012  -0.0108     0.0177       -0.0620      0.0405  
  2013  -0.0005     0.0128       -0.0377      0.0366  
  2014  -0.0278     0.0135       -0.0671      0.0114  
  2015  -0.0848     0.0182       -0.1375     -0.0320 *
  2016  -0.0209     0.0155       -0.0659      0.0241  
  2017   0.0172     0.0149       -0.0261      0.0605  
  2018  -0.0158     0.0151       -0.0597      0.0281  
  2019  -0.0224     0.0107       -0.0533      0.0086  
  2020   0.0017     0.0117       -0.0323      0.0358  
  2021  -0.0032     0.0136       -0.0428      0.0364  
  2022   0.0208     0.0070        0.0006      0.0410 *
---
Signif. codes: `*' confidence band does not cover 0

Control Group:  Not Yet Treated,  Anticipation Periods:  0
Estimation Method:  Doubly Robust
NULL 

-- Non-climate --
 
Call:
did::aggte(MP = att, type = "group")

Reference: Callaway, Brantly and Pedro H.C. Sant'Anna.  "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015> 


Overall summary of ATT's based on group/cohort aggregation:  
    ATT    Std. Error     [ 95%  Conf. Int.]  
 0.0413        0.0076     0.0265      0.0561 *


Group Effects:
 Group Estimate Std. Error [95% Simult.  Conf. Band]  
  2001   0.0598     0.0174        0.0091      0.1105 *
  2002   0.0216     0.0190       -0.0337      0.0768  
  2003   0.0442     0.0183       -0.0090      0.0974  
  2004   0.1150     0.0206        0.0551      0.1749 *
  2005   0.0602     0.0168        0.0114      0.1091 *
  2006   0.0393     0.0151       -0.0047      0.0833  
  2007   0.0796     0.0171        0.0299      0.1293 *
  2008   0.0527     0.0211       -0.0087      0.1140  
  2009   0.0788     0.0176        0.0278      0.1298 *
  2010  -0.0241     0.0138       -0.0642      0.0160  
  2011   0.0102     0.0141       -0.0306      0.0511  
  2012   0.0631     0.0155        0.0180      0.1081 *
  2013   0.0451     0.0138        0.0050      0.0853 *
  2014   0.0439     0.0130        0.0061      0.0818 *
  2015   0.0008     0.0165       -0.0470      0.0486  
  2016   0.0465     0.0104        0.0161      0.0768 *
  2017  -0.0158     0.0113       -0.0487      0.0172  
  2018  -0.0058     0.0155       -0.0508      0.0392  
  2019   0.0618     0.0128        0.0247      0.0990 *
  2020  -0.0208     0.0132       -0.0592      0.0176  
  2021   0.0229     0.0166       -0.0253      0.0711  
  2022  -0.0495     0.0140       -0.0901     -0.0089 *
---
Signif. codes: `*' confidence band does not cover 0

Control Group:  Not Yet Treated,  Anticipation Periods:  0
Estimation Method:  Doubly Robust
NULL 
Show code
# Build a descriptive-sample panel using "All climate" adoption (change if desired)
descP <- merge(
  base_panel[, .(adm2_id, year, pollution_CO2, pollution_CO2_log)],
  adopt_climate, by = "adm2_id", all.x = TRUE
)
descP[, treated := as.integer(!is.na(g) & year >= g)]
descP[, ever_treated := as.integer(!is.na(g))]

# Helper: nice quantile summary
q_summ <- function(x) {
  c(N = sum(!is.na(x)),
    mean = mean(x, na.rm = TRUE),
    sd   = sd(x, na.rm = TRUE),
    p10  = quantile(x, .10, na.rm = TRUE),
    p50  = quantile(x, .50, na.rm = TRUE),
    p90  = quantile(x, .90, na.rm = TRUE))
}

5.0.1 Sample overview (counts, years, treated share)

Show code
library(data.table)

ov <- list(
  N_obs            = nrow(descP),
  N_ids            = uniqueN(descP$adm2_id),
  N_years          = uniqueN(descP$year),
  year_min_max     = paste(range(descP$year, na.rm = TRUE), collapse = "–"),
  N_ever_treated   = descP[, uniqueN(adm2_id[ever_treated == 1])],
  N_never_treated  = descP[, uniqueN(adm2_id[ever_treated == 0 | is.na(ever_treated)])],
  share_treated_by_2023 = with(descP[year == max(year, na.rm = TRUE)],
                               mean(ever_treated == 1, na.rm = TRUE))
)
as.data.table(ov)

5.0.2 Outcome summaries (overall & by ever-treated)

Show code
summ_all  <- q_summ(descP$pollution_CO2_log)
by_status <- descP[, as.list(q_summ(pollution_CO2_log)), by = ever_treated]

as.data.table(t(summ_all))[]
Show code
by_status[]

5.0.3 Adoption timing (cohort sizes, cumulative share)

Show code
library(ggplot2)

# Cohort histogram (first g per ADM2)
cohort_sizes <- adopt_climate[!is.na(g), .N, by = g][order(g)]
cohort_sizes%>%
  filter(g>=2000)%>%
ggplot(aes(g, N)) +
  geom_col() +
  labs(title = "Cohort size by first treatment year (All climate)",
       x = "First project year (g)", y = "ADM2 count") +
  theme_classic(base_size = 12)

Show code
# Cumulative treated share over time
# Cumulative treated share = fraction of ADM2 whose g <= year
share_treated <- descP[!is.na(g),
  .(share = mean(g <= year)), 
  by = year
]

# Also include never-treated in denominator
all_ids <- unique(descP$adm2_id)
total_n <- length(all_ids)

share_treated <- descP[, .(share = mean(!is.na(g) & g <= year)), by = year]

ggplot(share_treated, aes(year, share)) +
  geom_line() + geom_point() +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Share of ADM2 ever treated (cumulative)",
       x = NULL, y = NULL) +
  theme_classic(base_size = 12)

5.0.5 Raw event-time mean (not causal; descriptive)

Show code
descP[, rel_time := ifelse(is.na(g), NA_integer_, year - g)]
raw_es <- descP[!is.na(rel_time),
                .(y = mean(pollution_CO2_log, na.rm = TRUE)), by = rel_time]

ggplot(raw_es, aes(rel_time, y)) +
  geom_line() + geom_point() +
  geom_vline(xintercept = -1, linetype = 2) +
  labs(title = "Raw event-time mean: log(1+CO₂) (All climate)",
       x = "Event time (t - g)", y = "Mean log(1+CO₂)") +
  theme_classic(base_size = 12)

5.0.6 Top emitters (ADM2) — average over the sample window

Show code
# Extract ISO3 from GADM code pattern like "CHN.10.9_1"
descP[, iso3 := sub("\\..*$", "", adm2_id)]
top_adm2 <- descP[, .(avg_CO2 = mean(pollution_CO2, na.rm = TRUE)),
                  by = .(iso3, adm2_id)][order(-avg_CO2)][1:20]

top_adm2[]